Renaming Column Names for easy code handling and Cleaning Data

e <- read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_anthropometric_all.xlsx")
e_normal <- read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_exercise_normal.xlsx")


#Renaming Variables

names(e)[names(e)=='measurement date']<- 'measurement_date'
names(e)[names(e)=='age (years)']<- 'age_years'
names(e)[names(e)=='age bin']<- 'age_bin'
#names(e)[names(e)=='z-category (WHO)']<- 'z_cat_WHO'
names(e)[length(e)]<-"z_cat_WHO"
names(e)[9]<-"z_score_WHO"
names(e)[6]<-"height_cm"
names(e)[7]<-"weight_kg"

#Dealing with NAs
ena<-na.omit(e)
any(is.na(ena$z_cat_WHO))
## [1] FALSE
#Changing Data Types
ena <- ena %>%
  mutate(gender = as.factor(gender),
         z_cat_WHO=as.factor(z_cat_WHO),
         measurement_date=as.Date(measurement_date),
         BMI = as.numeric(BMI),
         z_score_WHO=as.numeric(z_score_WHO),
         height_cm=as.numeric(height_cm),
         weight_kg=as.numeric(weight_kg))
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
#Removing "NA" Characters
ena$observation <- 1:nrow(ena)
x<-ena[ena$z_cat_WHO=="NA",]
y<-x$observation
ena<-ena[-y,]

#Adding additional column
ena$year <- year(ena$measurement_date)

Diving data as per Year

ena_2007 <- filter(ena, year(measurement_date) == 2007)
ena_2008 <- filter(ena, year(measurement_date) == 2008)
ena_2009 <- filter(ena, year(measurement_date) == 2009)
ena_2010 <- filter(ena, year(measurement_date) == 2010)
ena_2011 <- filter(ena, year(measurement_date) == 2011)
ena_2012 <- filter(ena, year(measurement_date) == 2012)
ena_2013 <- filter(ena, year(measurement_date) == 2013)
ena_2014 <- filter(ena, year(measurement_date) == 2014)
ena_2015 <- filter(ena, year(measurement_date) == 2015)
ena_2016 <- filter(ena, year(measurement_date) == 2016)
ena_2017 <- filter(ena, year(measurement_date) == 2017)
ena_2018 <- filter(ena, year(measurement_date) == 2018)

Checking uniques IDs whose observation has been taken continuously from 2007 to 2018

a_07_08 <- subset(ena_2007, ID %in% ena_2008$ID)
a_07_09 <- subset(a_07_08, ID %in% ena_2009$ID)
a_07_10 <- subset(a_07_09, ID %in% ena_2010$ID)
a_07_11 <- subset(a_07_10, ID %in% ena_2011$ID)
a_07_12 <- subset(a_07_11, ID %in% ena_2012$ID)
a_07_13 <- subset(a_07_12, ID %in% ena_2013$ID)
a_07_14 <- subset(a_07_13, ID %in% ena_2014$ID)
a_07_15 <- subset(a_07_14, ID %in% ena_2015$ID)
a_07_16 <- subset(a_07_15, ID %in% ena_2016$ID)
a_07_17 <- subset(a_07_16, ID %in% ena_2017$ID)
a_07_18 <- subset(a_07_17, ID %in% ena_2018$ID)

#a_07_18 <- kids data of 2007 whose obeservation were taken till the end of 2018 without any miss. 

a_unique_07<- subset(ena, ID %in% a_07_18$ID)

#a_unique_07 = All data of the a_07_18 dataset kids from 2007 to 2018.

#Measumements taken on Oct
Oct_07<-a_unique_07 %>% filter(month(measurement_date) %in% c(10))
by_year_cat_Oct_07<-Oct_07 %>% group_by(year, z_cat_WHO) %>% summarise(count=n())

ggplot(by_year_cat_Oct_07)+
  geom_col(aes(x=z_cat_WHO,y=count))+
  facet_wrap(~year)

Prop_07_18_Oct<-by_year_cat_Oct_07 %>% group_by(year) %>% 
            summarise(total=sum(count)) %>% 
            merge(., by_year_cat_Oct_07, all.y=TRUE) %>%
            mutate(Proportion=count*100/total) %>%
            select(year, z_cat_WHO, count, Proportion)


#Measumements taken on April
Apr_07<-a_unique_07 %>% filter(month(measurement_date) %in% c(4))
by_year_cat_Apr_07<-Apr_07 %>% group_by(year, z_cat_WHO) %>% summarise(count=n())

ggplot(by_year_cat_Apr_07)+
  geom_col(aes(x=z_cat_WHO,y=count))+
  facet_wrap(~year)

Prop_07_18_Apr<-by_year_cat_Apr_07 %>% group_by(year) %>% 
                summarise(total=sum(count)) %>% 
                merge(., by_year_cat_Apr_07, all.y=TRUE) %>%
                mutate(Proportion=count*100/total) %>%
                select(year, z_cat_WHO, count, Proportion)


#Checking the number of age counts in 2007
age_check_07 <- a_07_18 %>% group_by(age_bin, year) %>% summarise(count=n())
#Checking height
#Checking the Shortest and Tallest in 2007 and check throughout growth
#Checking 5 shortest heights in 2007 (boy and girl)
shortest_07_boy <- a_07_18 %>% arrange(height_cm) %>% filter((gender) %in% c("boy"))%>% head(5)
shortest_07_boy_IDs<-shortest_07_boy$ID

shortest_07_girl <- a_07_18 %>% arrange(height_cm) %>% filter((gender) %in% c("girl")) %>% head(5)
shortest_07_girl_IDs<-shortest_07_girl$ID

#Checking prop increase per year
a_unique_07_pct<-a_unique_07 %>%
  group_by(ID) %>% 
  arrange(measurement_date, .by_group = TRUE) %>%
  mutate(pct_change = (height_cm/lag(height_cm) - 1) * 100) %>% 
  select(ID, measurement_date, height_cm, pct_change)

#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
shortest_boy<-a_unique_07_pct %>% filter((ID) %in% c(shortest_07_boy_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(shortest_boy, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
shortest_girl<-a_unique_07_pct %>% filter((ID) %in% c(shortest_07_girl_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(shortest_girl, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#Negative growth rate check (Wrong Data)
shortest_girl%>% filter((ID)%in%c(262))
## # A tibble: 21 x 4
## # Groups:   ID [1]
##       ID measurement_date height_cm pct_change
##    <dbl> <date>               <dbl>      <dbl>
##  1   262 2007-10-01             116     NA    
##  2   262 2008-04-01             120      3.45 
##  3   262 2008-10-01             122      1.67 
##  4   262 2009-04-01             125      2.46 
##  5   262 2009-10-01             127      1.6  
##  6   262 2010-04-01             131      3.15 
##  7   262 2010-10-01             135      3.05 
##  8   262 2011-04-01             137      1.48 
##  9   262 2011-10-01             138      0.730
## 10   262 2012-04-01             141      2.17 
## # … with 11 more rows
#Checking 5 tallest heights in 2007 (boy and girl)

tallest_07_boy <- a_07_18 %>% arrange(desc(height_cm)) %>% filter((gender) %in% c("boy"))%>% head(5)
tallest_07_boy_IDs<-tallest_07_boy$ID

tallest_07_girl <- a_07_18 %>% arrange(desc(height_cm)) %>% filter((gender) %in% c("girl")) %>% head(5)
tallest_07_girl_IDs<-tallest_07_girl$ID

#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
tallest_boy<-a_unique_07_pct %>% filter((ID) %in% c(tallest_07_boy_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(tallest_boy, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
tallest_girl<-a_unique_07_pct %>% filter((ID) %in% c(tallest_07_girl_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(tallest_girl, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#Negative growth rate check (Wrong Data)
tallest_girl%>% filter((ID)%in%c(340))
## # A tibble: 20 x 4
## # Groups:   ID [1]
##       ID measurement_date height_cm pct_change
##    <dbl> <date>               <dbl>      <dbl>
##  1   340 2007-10-01             137     NA    
##  2   340 2008-04-01             140      2.19 
##  3   340 2009-04-01             146      4.29 
##  4   340 2009-10-01             149      2.05 
##  5   340 2010-04-01             154      3.36 
##  6   340 2010-10-01             156      1.30 
##  7   340 2011-04-01             161      3.21 
##  8   340 2011-10-01             162      0.621
##  9   340 2012-04-01             163      0.617
## 10   340 2012-10-01             164      0.613
## 11   340 2013-04-01             165      0.610
## 12   340 2013-10-01             165      0    
## 13   340 2014-04-01             166      0.606
## 14   340 2014-10-01             165     -0.602
## 15   340 2015-04-01             168      1.82 
## 16   340 2016-04-01             168      0    
## 17   340 2016-10-01             167     -0.595
## 18   340 2017-04-01             177      5.99 
## 19   340 2017-10-01             167     -5.65 
## 20   340 2018-04-01             167      0
#Checking the height in 2018 and investigating their growth from 2007 to 2018.
#Checking the Shortest and Tallest in 2018 and check throughout growth

#Finding IDs of the shortest kids on 2018
shortest_boy_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>% 
  arrange(height_cm) %>% filter((gender) %in% c("boy")) %>% head(5)
shortest_boy_18_IDs <-shortest_boy_18$ID

shortest_girl_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>% 
  arrange(height_cm) %>% filter((gender) %in% c("girl")) %>% head(5)
shortest_girl_18_IDs <-shortest_girl_18$ID

#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
shortest_boy_all<-a_unique_07_pct %>% filter((ID) %in% c(shortest_boy_18_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(shortest_boy_all, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
shortest_girl_all<-a_unique_07_pct %>% filter((ID) %in% c(shortest_girl_18_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(shortest_girl_all, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#Negative growth rate check (Wrong Data)
shortest_girl_all%>% filter((ID)%in%c(262))
## # A tibble: 21 x 4
## # Groups:   ID [1]
##       ID measurement_date height_cm pct_change
##    <dbl> <date>               <dbl>      <dbl>
##  1   262 2007-10-01             116     NA    
##  2   262 2008-04-01             120      3.45 
##  3   262 2008-10-01             122      1.67 
##  4   262 2009-04-01             125      2.46 
##  5   262 2009-10-01             127      1.6  
##  6   262 2010-04-01             131      3.15 
##  7   262 2010-10-01             135      3.05 
##  8   262 2011-04-01             137      1.48 
##  9   262 2011-10-01             138      0.730
## 10   262 2012-04-01             141      2.17 
## # … with 11 more rows
#Checking the height in 2018 and investigating their growth from 2007 to 2018.

#Finding IDs of the tallest kids on 2018
tallest_boy_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>% 
  arrange(desc(height_cm)) %>% filter((gender) %in% c("boy")) %>% head(5)
tallest_boy_18_IDs <-tallest_boy_18$ID

tallest_girl_18<-a_unique_07 %>% filter((year(measurement_date)) %in% c(2018)) %>% 
  arrange(desc(height_cm)) %>% filter((gender) %in% c("girl")) %>% head(5)
tallest_girl_18_IDs <-tallest_girl_18$ID

#Checking growth rate of the shortest kids from 2007 throught 2018
#BOY
tallest_boy_all<-a_unique_07_pct %>% filter((ID) %in% c(tallest_boy_18_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(tallest_boy_all, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#GIRL
tallest_girl_all<-a_unique_07_pct %>% filter((ID) %in% c(tallest_girl_18_IDs)) %>% 
  select(ID, measurement_date, height_cm, pct_change)

ggplot(tallest_girl_all, aes(x=measurement_date, y = pct_change ))+
  geom_line()+facet_wrap(~ID)+
  geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 1 rows containing missing values (geom_path).

#Negative growth rate check (Wrong Data)
tallest_girl_all%>% filter((ID)%in%c(898))
## # A tibble: 22 x 4
## # Groups:   ID [1]
##       ID measurement_date height_cm pct_change
##    <dbl> <date>               <dbl>      <dbl>
##  1   898 2007-10-01             142      NA   
##  2   898 2008-04-01             148       4.23
##  3   898 2008-10-01             148       0   
##  4   898 2009-04-01             152       2.70
##  5   898 2009-10-01             155       1.97
##  6   898 2010-04-01             160       3.23
##  7   898 2010-10-01             163       1.88
##  8   898 2011-04-01             169       3.68
##  9   898 2011-10-01             171       1.18
## 10   898 2012-04-01             171       0   
## # … with 12 more rows
#Checking growth of all boys and girls combined
mod <- sitar(x = age_years, y = height_cm, id = ID, data = a_unique_07, df = 5)
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)

#Growth curve for all IDs 
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07, col = ID, las = 1)

plot(mod, opt = 'a', col = ID, las = 1, xlim = xaxsd(), ylim = yaxsd())

#Growth curve for all IDs by boys and girls
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07, col = gender, las = 1)

plot(mod, opt = 'a', col = gender, las = 1, xlim = xaxsd(), ylim = yaxsd())
legend("bottomleft", legend=c("Black -> Boys", "Red -> Girls"), col=c("red", "blue"))

par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod, opt = 'd', las = 1, apv = TRUE)

##   apv    pv 
## 12.16  8.35
plot(mod, opt = 'v', las = 1, apv = TRUE, lty = 2)

##   apv    pv 
## 12.16  8.35
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod, opt = 'u', las = 1, col = 8, lwd = 0.5)
lines(mod, opt = 'd', lty = 2)
lines(mod, opt = 'ua', col = 4, subset = ID == 312)
lines(mod, opt = 'ua', col = 2, subset = ID == 277)
lines(mod, opt = 'ua', col = 6, subset = ID == 299)
lines(mod, opt = 'ua', col = 3, subset = ID == 310)
lines(mod, opt = 'ua', col = 1, subset = ID == 3387)
legend('bottomright', c('ID 312', 'ID 277','ID 299','ID 310','ID 3387','mean'),lty = c(1,1,1,1,1,2), col = c(4,2,6,3,8), cex = 0.8, inset=0.04)

pairs(ranef(mod), labels = c('size', 'timing', 'intensity'), pch=20)

#Dividing a_07_18 : boys and girls and show the growth curve : making 2 datasets from a_unique_07 (boys and girls) and fiting sitar

only_boy_07_18<-a_07_18 %>% filter((gender) %in% c('boy'))
boy_ID<-only_boy_07_18$ID
only_girl_07_18<-a_07_18 %>% filter((gender) %in% c('girl'))
girl_ID<-only_girl_07_18$ID
a_unique_07_boy<-a_unique_07 %>% filter((ID) %in% c(boy_ID))
a_unique_07_girl<-a_unique_07 %>% filter((ID) %in% c(girl_ID))

#Checking growth of only boys
mod_boy <- sitar(x = age_years, y = height_cm, id = ID, data = a_unique_07_boy, df = 5)
## Warning in nlme.formula(y ~ fitnlme(x, s1, s2, s3, s4, s5, a, b, c), fixed = s1
## + : Iteration 3, LME step: nlminb() did not converge (code = 1). PORT message:
## false convergence (8)
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)

#Growth curve for all boys IDs 
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07_boy, col = ID, las = 1)

plot(mod_boy, opt = 'a', col = ID, las = 1, xlim = xaxsd(), ylim = yaxsd())

par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_boy, opt = 'd', las = 1, apv = TRUE)

##    apv     pv 
## 13.340  9.757
plot(mod_boy, opt = 'v', las = 1, apv = TRUE, lty = 2)

##    apv     pv 
## 13.340  9.757
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_boy, opt = 'u', las = 1, col = 8, lwd = 0.5)
lines(mod_boy, opt = 'd', lty = 2)

pairs(ranef(mod_boy), labels = c('size', 'timing', 'intensity'), pch=20)

#Checking growth of only girls
mod_girl <- sitar(x = age_years, y = height_cm, id = ID, data = a_unique_07_girl, df = 5)
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)

#Growth curve for all boys IDs 
mplot(x = age_years, y = height_cm, id = ID, data = a_unique_07_girl, col = ID, las = 1)

plot(mod_girl, opt = 'a', col = ID, las = 1, xlim = xaxsd(), ylim = yaxsd())

par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_girl, opt = 'd', las = 1, apv = TRUE)

##    apv     pv 
## 11.380  7.303
plot(mod_girl, opt = 'v', las = 1, apv = TRUE, lty = 2)

##    apv     pv 
## 11.380  7.303
par(mar = c(4,4,1,1) + 0.1, cex = 0.8)
plot(mod_girl, opt = 'u', las = 1, col = 8, lwd = 0.5)
lines(mod_girl, opt = 'd', lty = 2)

pairs(ranef(mod_girl), labels = c('size', 'timing', 'intensity'), pch=20)

e_normal<-read_excel("~/SharedFiles/ST606/2020/data/Exercise/fit_database_exercise_normal.xlsx", na="NA")
ena_normal <- na.omit(e_normal)
nrow(ena_normal)
## [1] 40518
colnames(ena_normal)
##  [1] "ID"                   "measurement date"     "age (years)"         
##  [4] "age bin"              "gender"               "height (cm)"         
##  [7] "weight (kg)"          "BMI"                  "z-score (WHO)"       
## [10] "z-category (WHO)"     "running distance (m)" "running time (s)"    
## [13] "running speed (m/s)"  "pulse 0'"             "pulse 1'"            
## [16] "pulse 5'"             "pulse 10'"            "SBP 0'"              
## [19] "SBP 1'"               "SBP 5'"               "SBP 10'"             
## [22] "DBP 0'"               "DBP 1'"               "DBP 5'"              
## [25] "DBP 10'"
#And Compare both shortest and tallest
#check weight and BMI and heartrate as well